Utility bills have three components: energy charges, demand charges, and fixed meter charges.
Demand and energy charges tend to have different rates for secondary, primary, and transmission service levels, depending on where the customer is interconnected. Typical residential and commercial end uses are secondary. Larger electricity customers that can connect directly to transmission or upstream of a distribution substation (primary) typically get lower rates.
Two data sets were used for this exercise: the anonymized EnerNOC building energy use data and the OpenEI Utility Rate Database (URDB).
In order to access the Utility rate database, you must first register for a (free) API key. After doing so, create a .env file setting OPENEI_APIKEY in the runtime directory of this notebook. Alternatively, you can set the OPENEI_APIKEY environment variable manually.
Example ./.env file:
OPENEI_APIKEY=REPLACE_ME
import os
import jmespath
import requests
import shutil
import vega_datasets
import altair as alt
import dask.dataframe as dd
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
#####################################################################
# Configurable Options
# One of "A10" or "A10_TOU"
SCHEDULE = "A10"
# One of "Education", "Commercial Property", "Light Industrial", "Food Sales & Storage"
INDUSTRY = "Education"
# Results directory to write tables of results
results_dir = Path("./results/")
#####################################################################
# Hard-coded A10 rate schedule IDs for the lab assignment.
# NOTE: OpenEI URDB also supports dynamic lookups based on location.
SCHEDULES = {
    "A10": "5e162e2a5457a3d50873e3af",
    "A10_TOU": "5e1630285457a3d35f73e3af",
}
# Remote endpoints: EnerNOC S3 bucket (building data) and the OpenEI rate API.
BASE_DATA_URL = "https://open-enernoc-data.s3.amazonaws.com/anon"
OPENEI_API_URL = "https://api.openei.org"
# Load ./.env (if present) so the OpenEI API key can be read from the environment.
load_dotenv()
OPENEI_APIKEY = os.getenv("OPENEI_APIKEY")
# Load the site metadata table.
sites = pd.read_csv(f"{BASE_DATA_URL}/meta/all_sites.csv")
sites.head()

# Visualize histogram of building types.
# Pre-compute sort orders so bars and facets render largest-first.
_sub_industry_order = (
    sites.groupby("SUB_INDUSTRY")["SITE_ID"].count()
    .sort_values(ascending=False).index.tolist()
)
_industry_order = (
    sites.groupby("INDUSTRY")["SUB_INDUSTRY"].nunique()
    .sort_values(ascending=False).index.tolist()
)
site_summary = alt.Chart(sites).mark_bar().encode(
    x=alt.X('SUB_INDUSTRY:N', title="", sort=_sub_industry_order),
    y=alt.Y('count(SITE_ID)', title="Number of Sites"),
).facet(
    columns=1,
    align="all",
    column=alt.Column(
        "INDUSTRY",
        title="Industry",
        header=alt.Header(labelOrient="top", titleOrient="top"),
        sort=_industry_order,
    ),
).resolve_scale(
    x="independent"
)
# Plot Site Locations
states = alt.topo_feature(vega_datasets.data.us_10m.url, feature='states')
# Background layer: US state outlines on an Albers USA projection.
_state_background = alt.Chart(states).mark_geoshape(
    fill='lightgrey',
    stroke='white',
).project('albersUsa')
# Foreground layer: one point per site, colored by sub-industry.
_site_points = alt.Chart(sites).mark_point().encode(
    latitude='LAT',
    longitude='LNG',
    tooltip=['INDUSTRY', 'SUB_INDUSTRY'],
    color=alt.Color(
        'SUB_INDUSTRY',
        title="Sub Industry",
        scale=alt.Scale(scheme="tableau10"),
        legend=alt.Legend(orient="left"),
    ),
)
site_locations = alt.layer(_state_background, _site_points)

# Plot Summary View
alt.hconcat(site_locations, site_summary).configure_axis(grid=False)
Utility Bill Calculation:
- Load Duration Curve
- Seasonal Load
# Get all site-level building energy use data, and cache it to local disk storage for this exercise
def refresh_site_energy_use_data(output_dir, hard_refresh=False, limit=5, **kwargs):
    """Warning: SLOW! - Refresh building energy use data from EnerNOC as a local parquet directory.

    Args:
        output_dir (str, pathlib.Path): Output directory for the parquet dataset.
        hard_refresh (bool): If True, an existing parquet dataset is removed and
            re-downloaded from scratch.
        limit (int, optional): Maximum number of site CSVs to download. Defaults
            to 5 (the original debug cap, preserved for backward compatibility).
            Pass None to download data for every site.
        kwargs: Additional arguments to pass onto dask.DataFrame.to_parquet.

    Warning:
        Non-parquet related files should not be stored in the parquet output directory.
    """
    output_dir = Path(output_dir)
    # A "_metadata" file is written by to_parquet, so its presence marks an
    # existing parquet dataset.
    has_existing_data = (output_dir / "_metadata").exists()
    if hard_refresh and has_existing_data:
        # Remove output directory if it looks like a parquet directory with _metadata
        shutil.rmtree(output_dir)
        # BUGFIX: flag must be cleared so the data is actually re-downloaded below;
        # previously a hard refresh deleted the cache and then skipped the rebuild.
        has_existing_data = False
    # Create output directory (and any missing parents) if it doesn't already exist
    output_dir.mkdir(parents=True, exist_ok=True)
    if not has_existing_data:
        # Create list of per-site CSV URLs, optionally capped at `limit`
        data_files = (f"{BASE_DATA_URL}/csv/" + sites["SITE_ID"].astype(str) + ".csv").to_list()
        if limit is not None:
            data_files = data_files[:limit]
        # Collect all data from EnerNOC
        tmp = dd.read_csv(list(data_files), include_path_column=True)
        # Recover the numeric site id from each file path (e.g. ".../123.csv" -> 123)
        tmp["site_id"] = tmp["path"].str.extract(r"(?P<site_id>\d+(?=\.csv))")["site_id"].astype(int)
        # Partition on site-ids for simple read-in filters
        tmp.to_parquet(str(output_dir), partition_on=["site_id"], **kwargs)
# Populate the local energy-use cache (capped at 5 sites by default; see function docstring)
refresh_site_energy_use_data("./data/energy_use/")
# Get rate data
schedule = SCHEDULES[SCHEDULE]
# Get Rate data from OpenEI Utility Rate Database API
url = f"{OPENEI_API_URL}/utility_rates?api_key={OPENEI_APIKEY}&getpage={schedule}&format=json&version=7&detail=full"
# BUGFIX: fail fast on HTTP errors or a hung connection instead of silently
# trying to parse an error page as JSON.
response = requests.get(url, timeout=30)
response.raise_for_status()
rates = response.json()
# Read Energy Use Data
# Filter for the desired sites -- every site in the configured INDUSTRY.
site_ids = sites.loc[sites["INDUSTRY"] == INDUSTRY, "SITE_ID"].unique().tolist()
# Parquet supports Hive-based filtering on partitions
filters = [('site_id', 'in', site_ids)]
# DEBUG: Use single site
# filters = [('site_id', '==', 10)]
# Read the parquet dataset into a single in-memory DataFrame
data = dd.read_parquet("./data/energy_use/", filters=filters).compute()
# Merge energy use data by site with site metadata
data = data.merge(sites.rename(columns={"SITE_ID": "site_id"}), on="site_id")
# Derive naive local timestamps so sites in multiple timezones can be handled uniformly
_offset = pd.to_timedelta(data['TZ_OFFSET'].map("{}:00".format), unit="hour")
data["local_datetime"] = data['dttm_utc'] + _offset
For the A10 rate, utility bills are broken across three billing categories: energy charges, demand charges, and fixed meter charges.
Generalizing this beyond a handful of fixed rates may require parsing more data about the billing structure from arbitrary utility rate schedules from all US Utilities.
def parse_energy_schedule(data, rate_mapping):
    """Parse an openEI URDB energy schedule.

    Reshapes a 12x24 (month x hour) schedule grid into a long table with one
    row per (month, hour), then maps each schedule tier to its rate.
    Months are converted from 0-indexed to calendar months (1-12); hours stay
    0-indexed.
    """
    wide = pd.DataFrame(data).rename_axis("month").reset_index()
    long = wide.melt(
        id_vars=["month"],
        var_name="hour",
        value_name="schedule",
    )
    # URDB months are 0-indexed; shift to calendar months.
    long["month"] = long["month"] + 1
    # Translate each schedule tier index into its $/kWh rate.
    long["rate"] = long["schedule"].map(rate_mapping)
    return long
# Get Energy Rates and Schedules
# energy_rates maps schedule tier index -> $/kWh rate from the URDB response.
energy_rates = pd.Series(jmespath.search("items[0].energyratestructure[].rate", rates))
# Combine weekday and weekend schedules into one table
energy_schedule = pd.concat(
[
parse_energy_schedule(jmespath.search("items[0].energyweekendschedule", rates), energy_rates).assign(weekday=False),
parse_energy_schedule(jmespath.search("items[0].energyweekdayschedule", rates), energy_rates).assign(weekday=True)
]
)
# Merge (up to Hourly) Energy Rates
# left_on uses three computed keys from the local timestamp: a weekday flag
# (dayofweek 5/6 are Sat/Sun, so ~isin([5,6]) means "is a weekday"), the
# month, and the hour. The left merge preserves row order, and the "rate"
# column is assigned back onto `data` by index — this assumes `data` carries a
# default RangeIndex (true after the merge above); verify if that changes.
data["energy_rate"] = data[["local_datetime"]].merge(
energy_schedule,
left_on=[~data["local_datetime"].dt.dayofweek.isin([5,6]), data["local_datetime"].dt.month, data["local_datetime"].dt.hour],
right_on=["weekday", "month", "hour"],
how="left"
)["rate"]
# Calculate Energy Charges
# Per-interval energy cost = kWh used * $/kWh rate for that hour.
data["energy_cost"] = data["value"] * data["energy_rate"]
# Aggregate to monthly (freq="M" = month-end) totals per site, keeping the
# mean hourly rate for reference.
energy_cost = data.groupby([pd.Grouper(key="local_datetime", freq="M"), "site_id"]).agg(
{"value": "sum", "energy_cost": "sum", "energy_rate": "mean"}
).rename(columns={"energy_rate": "avg_energy_rate"})
# Get Demand charges and Demand Schedule
# flatdemandstructure gives the tiered $/kW rates; flatdemandmonths maps each
# (0-indexed) month to a rate tier.
demand_rates = pd.Series(jmespath.search("items[0].flatdemandstructure[].rate", rates))
demand_schedule = pd.DataFrame({"schedule": jmespath.search("items[0].flatdemandmonths", rates)}).rename_axis("month").reset_index()
# Convert 0-indexed months to calendar months (1-12).
demand_schedule["month"] += 1
demand_schedule["rate"] = demand_schedule["schedule"].map(demand_rates)
# Merge Demand Rates
data["demand_rate"] = data["local_datetime"].dt.month.map(demand_schedule.set_index("month")["rate"])
# Calculate Demand Charges - group over all 15-min intervals
# "first" is safe for demand_rate here because the rate is constant within a
# month, hence constant within any 15-minute interval.
avg_15min_demand = data.groupby(["site_id", pd.Grouper(key="local_datetime", freq="15 min")]).agg({"demand_rate": "first", "value" : "sum"})
# Calculate demand within each 15 min interval: kW = kWh/hrs
avg_15min_demand["kW"] = avg_15min_demand["value"] / (15/60)
# Take max 15-min avg demand over each "billing period" (which we assume to be Month), then multiply the demand rate by the max demand in that month
demand_cost = avg_15min_demand.groupby([pd.Grouper(level="local_datetime", freq="M"), "site_id"]).agg({"demand_rate": "first", "kW": "max"})
demand_cost["demand_cost"] = demand_cost["kW"] * demand_cost["demand_rate"]
# Get Fixed Charges ($/Month/meter)
meter_charge = jmespath.search("items[0].fixedchargefirstmeter", rates)
# Calculate Fixed Daily Charges: Calculate the number of days in each month for each site, then multiply by the fixed meter charge.
# nunique over day-of-month counts the distinct days with data in each month,
# so partial months are billed only for days actually observed.
meter_cost = data.assign(num_days = data["local_datetime"].dt.day).groupby(["site_id", pd.Grouper(key="local_datetime", freq="M")])[["num_days"]].nunique()
meter_cost["meter_cost"] = meter_cost["num_days"]*meter_charge
# Reorder the index levels to (local_datetime, site_id) so they line up with
# the demand and energy tables for the concat below.
meter_cost = meter_cost.swaplevel().sort_index()
# Combine all cost data together so we can add it later
# concat(axis=1) aligns the three tables on their shared (month, site) index.
total_cost = pd.concat([meter_cost, demand_cost, energy_cost], axis=1)
total_cost["total_cost"] = total_cost["meter_cost"] + total_cost["energy_cost"] + total_cost["demand_cost"]
# Transform data to 3rd normal form for plotting later on
costs_by_type = pd.melt(
    total_cost.reset_index()[["local_datetime", "site_id", "meter_cost", "demand_cost", "energy_cost"]].rename(
        columns={"demand_cost": "Demand (kW)", "energy_cost": "Energy (kWh)", "meter_cost": "Meter ($/day/billing cycle)"}),
    id_vars=["local_datetime", "site_id"],
    var_name="charge_category",
    value_name="cost"
)
# Write our utility bills out to disk.
# BUGFIX: the results directory was only created later in the notebook, so
# this (the first) write would fail on a fresh checkout. Create it up front.
results_dir.mkdir(parents=True, exist_ok=True)
total_cost.reset_index().to_csv(results_dir / f"{INDUSTRY}-utility_bills.csv", index=False)
Seasonal demand is calculated by taking the maximum power observed in any 15-min billing period over each hour for each site. Once an hourly peak demand is calculated, we average over each hour of the day over the entire billing period for each site and "billing season".
# Reuse our calculated avg 15 min demand from the utility bill calculation
# Hourly peak = max 15-min demand within each hour, per site.
max_seasonal_demand = avg_15min_demand.groupby(["site_id", pd.Grouper(level="local_datetime", freq="H")]).agg({"kW": "max"}).reset_index()
# Join each hourly peak to its month's demand schedule tier (the billing "season").
demand_by_season = max_seasonal_demand.merge(demand_schedule, left_on=max_seasonal_demand["local_datetime"].dt.month, right_on="month")
# Group by hour of day, and take the average peak over each hour across sites and schedule type (winter/summer)
demand_by_season["hour_of_day"] = demand_by_season["local_datetime"].dt.hour
demand_by_season = demand_by_season.groupby(["site_id", "schedule", "hour_of_day"]).agg({"kW": "mean"}).reset_index()
# Write our utility bills out to disk.
# NOTE(review): this repeats the utility-bill write from earlier in the
# notebook; it is the write that actually creates the results directory first.
results_dir = Path("./results/")
results_dir.mkdir(exist_ok=True)
total_cost.reset_index().to_csv(results_dir / f"{INDUSTRY}-utility_bills.csv", index=False)
demand_by_season.head()
# Calculate a load duration curve by grouping over each site, and each hour then taking the maximum 15-min peak within each hour
max_hourly_demand = avg_15min_demand.groupby(["site_id", pd.Grouper(level="local_datetime", freq="H")]).agg({"kW": "max"})
# Sort by the peak demand first in descending order, and then groupby and add a dummy index or "rank"
# The per-group reset_index() assigns 0..n-1 within each site in descending-kW
# order; that integer surfaces as "level_1" after the outer reset_index and is
# renamed to "idx" — the rank axis of the load duration curve.
load_curve = max_hourly_demand.sort_values(
by="kW", ascending=False
).groupby(level="site_id").apply(
lambda df: df.reset_index()
)[["kW", "local_datetime"]].reset_index().rename(columns={"level_1": "idx"})
load_curve.head()
Below we configure and plot key results, and a few data samples as well.
# We are generating potentially hundreds of 8760s
# this is kind of a lot of data, so we send data io into a slower, but more forgiving background process
# alt.data_transformers.enable('data_server')
# If running this in binder, use this instead!
# alt.data_transformers.enable('data_server_proxied')
# For publishing, swap to default data transformer to bundle data with notebook
alt.data_transformers.enable('default', max_rows=9999999)

# Define global inputs: a site-picker dropdown shared by every chart below.
site_picker = alt.binding_select(options=site_ids)
selection = alt.selection_single(
    fields=['site_id'],
    bind=site_picker,
    name="Site ID",
    empty="none",
    init={'site_id': site_ids[0]},
)

# Add a plot for utility bills, stacked by billing category per month.
utility_bill_dashboard = alt.Chart(costs_by_type).mark_bar().encode(
    x=alt.X("yearmonth(local_datetime)", title="Month"),
    y=alt.Y("cost", title="Total Cost ($)"),
    color=alt.Color("charge_category", title="Billing Category"),
    tooltip=[
        alt.Tooltip("charge_category", title="Billing Category"),
        alt.Tooltip("cost", title="Cost ($)"),
    ],
).add_selection(selection).transform_filter(selection)
# Add a plot for the load duration curve, filtered to the selected site.
load_curve_chart = (
    alt.Chart(load_curve)
    .mark_line()
    .encode(
        x=alt.X("idx", title="Rank", axis=alt.Axis(labels=False)),
        y=alt.Y("kW", title="Hourly Peak 15-min Demand"),
        tooltip=["kW", "local_datetime"],
    )
    .add_selection(selection)
    .transform_filter(selection)
)
# Add a plot for "seasonal" demand, where seasons are billing seasons
# They happen to roughly correspond to "winter" and "summer"
seasonal_demand = (
    alt.Chart(demand_by_season)
    .mark_line()
    .encode(
        x=alt.X("hour_of_day", title="Hour"),
        y=alt.Y("kW", title="Average peak kW"),
        color=alt.Color("schedule:N", title="Season"),
    )
    .add_selection(selection)
    .transform_filter(selection)
)
The load duration curve, average seasonal load, and calculated utility bills over the analysis period are shown below.
# Assemble the dashboard: load curve and bills side by side, seasonal demand below.
top_row = load_curve_chart | utility_bill_dashboard
dash = top_row & seasonal_demand
dash.resolve_scale(color='independent')